In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
from ggplot import *

Utility functions to parse experiment metadata


In [3]:
def parse_barcodes(bcfile, bc_id='BC'):
    res = {}
    with open(bcfile, 'r') as fi:
        for line in fi:
            fields = line.strip().split(',')
            if fields[0].startswith(bc_id):
                res[fields[0]] = fields[1]
    return res


def parse_exp_config(expfile, bc_dict):
    res = []
    fieldnames = ['id', 'sample', 'cond', 'barcode', 'size', 'region', 'Qbit', 'conc', 'dilution']
    with open(expfile) as fi:
        reader = csv.DictReader(fi, fieldnames=fieldnames)
        for rec in reader:
            if rec['id']:
                res.append({
                    'sample': rec['sample'],
                    'bc_id': rec['barcode'],
                    'bc_seq': bc_dict[rec['barcode']],
                    'temp': int(rec['cond'][:2]),
                    'bcm': '+' in rec['cond'],
                })
    return pd.DataFrame.from_records(res)

Read the counts table

This is bacterial mRNA-Seq with samples at 4 different temperatures with or without the addition of BCM. Each condition is sequenced in triplicates. And we are interested in 5'UTRs transcription levels.


In [4]:
bc_dict = parse_barcodes('../data/Lexogen_Sense_RNA-Seq.csv')
exp_df = parse_exp_config('../data/2017-03-09_NextSeq.csv', bc_dict)
agg_utr = pd.DataFrame.from_csv('../data/utr.counts.csv')
agg_utr


Out[4]:
gene TSS end start UTR_length utr_counts sample
0 C0343 2818676 2818728 2818676 52 4 utRho01
1 aaeB 6776526 6776526 6776376 150 20 utRho01
2 aaeR 6778762 6779040 6778762 278 136 utRho01
3 aaeX 6778860 6778860 6778674 186 170 utRho01
4 aat 1854920 1854920 1854864 56 32 utRho01
5 accA 416822 417242 416822 420 2390 utRho01
6 accB 6810280 6810872 6810280 592 13076 utRho01
7 accD 4868030 4868030 4867852 178 740 utRho01
8 aceB 8430804 8430956 8430804 152 102 utRho01
9 aceE 245938 246034 245938 96 764 utRho01
10 ackA 4826918 4826940 4826918 22 2578 utRho01
11 acnA 2670848 2671662 2670848 814 840 utRho01
12 acnB 263038 263230 263038 192 1182 utRho01
13 acpP 2303148 2303230 2303148 82 244 utRho01
14 acpS 5402324 5402324 5401996 328 144 utRho01
15 acrA 971396 971396 971238 158 436 utRho01
16 acrB 969370 969370 968806 564 2634 utRho01
17 acrD 5174950 5175190 5174950 240 98 utRho01
18 acrE 6827404 6827728 6827404 324 0 utRho01
19 acrR 971398 971522 971398 124 262 utRho01
20 acrZ 1589502 1589546 1589502 44 0 utRho01
21 acs 8575190 8575190 8574742 448 70 utRho01
22 actP 8569870 8569870 8569804 66 0 utRho01
23 ada 4620854 4620854 4620810 44 48 utRho01
24 add 3404404 3404466 3404404 62 22 utRho01
25 ade 7687802 7687928 7687802 126 0 utRho01
26 adhE 2596826 2596826 2596242 584 96 utRho01
27 adhP 3107988 3107988 3107676 312 1158 utRho01
28 adiA 8681112 8681112 8681042 70 48 utRho01
29 adiC 8674404 8674404 8674062 342 18 utRho01
... ... ... ... ... ... ... ...
2356 yrfF 7052548 7052938 7052548 390 782 utRho30
2357 yrhB 7169440 7169518 7169440 78 162 utRho30
2358 ysaB 7451804 7451804 7451424 380 324 utRho30
2359 ysgA 8032686 8032686 8032338 348 346 utRho30
2360 ytfB 8857638 8857638 8857434 204 980 utRho30
2361 ytfE 8864018 8864018 8863966 52 310 utRho30
2362 ytfF 8866296 8866296 8866130 166 1342 utRho30
2363 ytfH 8868164 8868226 8868164 62 260 utRho30
2364 ytfI 8875234 8875414 8875234 180 82 utRho30
2365 ytfJ 8878572 8878572 8878524 48 100 utRho30
2366 ytfK 8879022 8879174 8879022 152 290 utRho30
2367 ytfL 8882642 8882642 8882430 212 118 utRho30
2368 ytfQ 8899634 8899924 8899634 290 516 utRho30
2369 ytjB 9249598 9249598 9249578 20 6 utRho30
2370 yzfA 8896588 8896588 8896490 98 154 utRho30
2371 zapA 6111102 6111224 6111102 122 326 utRho30
2372 zapB 8236772 8237030 8236772 258 462 utRho30
2373 zapC 2011700 2011904 2011700 204 144 utRho30
2374 zinT 4082692 4082750 4082692 58 4 utRho30
2375 zipA 5062814 5062814 5062466 348 160 utRho30
2376 zitB 1569716 1569716 1569646 70 92 utRho30
2377 zntA 7212844 7212902 7212844 58 4 utRho30
2378 zntR 6878336 6878336 6878260 76 114 utRho30
2379 znuA 3885322 3885322 3885166 156 62 utRho30
2380 znuC 3885268 3885324 3885268 56 60 utRho30
2381 zraP 8403424 8403424 8403376 48 2 utRho30
2382 zraS 8403810 8403852 8403810 42 82 utRho30
2383 zupT 6364866 6365100 6364866 234 366 utRho30
2384 zur 8520282 8520282 8520006 276 628 utRho30
2385 zwf 3872752 3872752 3872628 124 70 utRho30

71580 rows × 7 columns

Normalize to UTR length


In [5]:
def normalize(df, edf, columns=None):
    '''
    Prepares the UTR dataframe (`df`) for log transformation.
    
    Adds experiment metadata from `edf`.
    Adds pseudocounts to `utr_counts` and `UTR_length`.
    Normalizes counts to UTR length.
    '''
    def pseudo_counts(x):
        return x + 1 if x == 0 else x

    df = df.merge(edf, how='left', on='sample')
    if columns is not None:
        df = df[columns]
    # Add pseudocounts to allow log transform later
    df['utr_counts'] = df['utr_counts'].apply(pseudo_counts)
    df['UTR_length'] = df['UTR_length'].apply(pseudo_counts)
    df['utr_norm'] = df['utr_counts'] / df['UTR_length']
    return df

In [6]:
columns = ['gene', 'TSS', 'start', 'end', 'UTR_length',
               'utr_counts', 'sample', 'bcm', 'temp']
utr = normalize(agg_utr, exp_df, columns)
utr


Out[6]:
gene TSS start end UTR_length utr_counts sample bcm temp utr_norm
0 C0343 2818676 2818676 2818728 52 4 utRho01 False 10 0.076923
1 aaeB 6776526 6776376 6776526 150 20 utRho01 False 10 0.133333
2 aaeR 6778762 6778762 6779040 278 136 utRho01 False 10 0.489209
3 aaeX 6778860 6778674 6778860 186 170 utRho01 False 10 0.913978
4 aat 1854920 1854864 1854920 56 32 utRho01 False 10 0.571429
5 accA 416822 416822 417242 420 2390 utRho01 False 10 5.690476
6 accB 6810280 6810280 6810872 592 13076 utRho01 False 10 22.087838
7 accD 4868030 4867852 4868030 178 740 utRho01 False 10 4.157303
8 aceB 8430804 8430804 8430956 152 102 utRho01 False 10 0.671053
9 aceE 245938 245938 246034 96 764 utRho01 False 10 7.958333
10 ackA 4826918 4826918 4826940 22 2578 utRho01 False 10 117.181818
11 acnA 2670848 2670848 2671662 814 840 utRho01 False 10 1.031941
12 acnB 263038 263038 263230 192 1182 utRho01 False 10 6.156250
13 acpP 2303148 2303148 2303230 82 244 utRho01 False 10 2.975610
14 acpS 5402324 5401996 5402324 328 144 utRho01 False 10 0.439024
15 acrA 971396 971238 971396 158 436 utRho01 False 10 2.759494
16 acrB 969370 968806 969370 564 2634 utRho01 False 10 4.670213
17 acrD 5174950 5174950 5175190 240 98 utRho01 False 10 0.408333
18 acrE 6827404 6827404 6827728 324 1 utRho01 False 10 0.003086
19 acrR 971398 971398 971522 124 262 utRho01 False 10 2.112903
20 acrZ 1589502 1589502 1589546 44 1 utRho01 False 10 0.022727
21 acs 8575190 8574742 8575190 448 70 utRho01 False 10 0.156250
22 actP 8569870 8569804 8569870 66 1 utRho01 False 10 0.015152
23 ada 4620854 4620810 4620854 44 48 utRho01 False 10 1.090909
24 add 3404404 3404404 3404466 62 22 utRho01 False 10 0.354839
25 ade 7687802 7687802 7687928 126 1 utRho01 False 10 0.007937
26 adhE 2596826 2596242 2596826 584 96 utRho01 False 10 0.164384
27 adhP 3107988 3107676 3107988 312 1158 utRho01 False 10 3.711538
28 adiA 8681112 8681042 8681112 70 48 utRho01 False 10 0.685714
29 adiC 8674404 8674062 8674404 342 18 utRho01 False 10 0.052632
... ... ... ... ... ... ... ... ... ... ...
71550 yrfF 7052548 7052548 7052938 390 782 utRho30 True 51 2.005128
71551 yrhB 7169440 7169440 7169518 78 162 utRho30 True 51 2.076923
71552 ysaB 7451804 7451424 7451804 380 324 utRho30 True 51 0.852632
71553 ysgA 8032686 8032338 8032686 348 346 utRho30 True 51 0.994253
71554 ytfB 8857638 8857434 8857638 204 980 utRho30 True 51 4.803922
71555 ytfE 8864018 8863966 8864018 52 310 utRho30 True 51 5.961538
71556 ytfF 8866296 8866130 8866296 166 1342 utRho30 True 51 8.084337
71557 ytfH 8868164 8868164 8868226 62 260 utRho30 True 51 4.193548
71558 ytfI 8875234 8875234 8875414 180 82 utRho30 True 51 0.455556
71559 ytfJ 8878572 8878524 8878572 48 100 utRho30 True 51 2.083333
71560 ytfK 8879022 8879022 8879174 152 290 utRho30 True 51 1.907895
71561 ytfL 8882642 8882430 8882642 212 118 utRho30 True 51 0.556604
71562 ytfQ 8899634 8899634 8899924 290 516 utRho30 True 51 1.779310
71563 ytjB 9249598 9249578 9249598 20 6 utRho30 True 51 0.300000
71564 yzfA 8896588 8896490 8896588 98 154 utRho30 True 51 1.571429
71565 zapA 6111102 6111102 6111224 122 326 utRho30 True 51 2.672131
71566 zapB 8236772 8236772 8237030 258 462 utRho30 True 51 1.790698
71567 zapC 2011700 2011700 2011904 204 144 utRho30 True 51 0.705882
71568 zinT 4082692 4082692 4082750 58 4 utRho30 True 51 0.068966
71569 zipA 5062814 5062466 5062814 348 160 utRho30 True 51 0.459770
71570 zitB 1569716 1569646 1569716 70 92 utRho30 True 51 1.314286
71571 zntA 7212844 7212844 7212902 58 4 utRho30 True 51 0.068966
71572 zntR 6878336 6878260 6878336 76 114 utRho30 True 51 1.500000
71573 znuA 3885322 3885166 3885322 156 62 utRho30 True 51 0.397436
71574 znuC 3885268 3885268 3885324 56 60 utRho30 True 51 1.071429
71575 zraP 8403424 8403376 8403424 48 2 utRho30 True 51 0.041667
71576 zraS 8403810 8403810 8403852 42 82 utRho30 True 51 1.952381
71577 zupT 6364866 6364866 6365100 234 366 utRho30 True 51 1.564103
71578 zur 8520282 8520006 8520282 276 628 utRho30 True 51 2.275362
71579 zwf 3872752 3872628 3872752 124 70 utRho30 True 51 0.564516

71580 rows × 10 columns

-BCM samples


In [7]:
# build expression matrix
X = pd.DataFrame()
samples = []

for sample in set(utr['sample']):
    mask = (utr['sample']==sample) & (utr['bcm']==False)
    if not utr[mask].empty:
        X[sample] = utr[mask]['utr_norm'].values
        samples.append(sample)
    
X_std = StandardScaler().fit_transform(X.values.T)
X_std


Out[7]:
array([[-0.41182215, -0.35767468,  0.36480451, ..., -0.67030221,
        -0.43604798, -0.7975759 ],
       [-0.41182215, -0.35767468, -1.26864851, ...,  0.03550018,
        -0.85532488, -0.08304569],
       [ 3.38858011, -0.35767468, -1.39115748, ...,  3.18640372,
        -0.34075777,  1.20690265],
       ..., 
       [-0.41182215, -0.35767468, -0.57443098, ...,  0.1426309 ,
        -0.06251037,  3.15447171],
       [-0.41182215, -0.35767468,  0.89567674, ..., -0.35521186,
         1.25630607, -0.56361468],
       [ 1.18622686, -0.35767468,  0.77316776, ..., -0.02121608,
         0.19667898,  0.59986813]])

In [8]:
sklearn_pca = sklearnPCA(n_components=10)
Y = sklearn_pca.fit_transform(X_std)
print(Y)
print(sklearn_pca.explained_variance_)
print(sklearn_pca.explained_variance_ratio_)


[[ -6.30080559e+00   3.73924036e+00  -1.85311316e+01   8.79861815e+00
   -2.30236523e+00   5.50160579e+00  -9.13970912e+00   1.75330517e+00
   -7.95371836e-01  -1.49936590e+00]
 [  9.29415499e-01  -6.18164499e+00  -1.72009951e+01   3.90190886e+00
   -5.49493463e-01   7.52610254e-01  -1.34938215e+00  -2.20760005e+00
    1.01838153e-01   1.09447231e+01]
 [ -1.75999761e+01  -3.26667898e+01   2.62626684e+01  -3.34977237e+00
   -7.20226639e+00  -2.31513457e+01  -2.25837793e+01   4.44320645e+00
   -3.02761258e-01  -1.86420082e-02]
 [ -1.14384673e+01  -9.40841117e+00  -1.90377592e+01  -1.18198791e+01
    3.46392884e+00  -1.92667035e+00   3.75850722e+00  -2.00741120e+00
   -1.25170924e+00   2.48891244e+01]
 [ -8.53029322e+00   6.16438977e+00  -1.87802275e+01   6.37857148e+00
   -2.25020072e+00   5.74290050e+00  -1.07597623e+01   6.26823183e-01
   -1.89630839e+00  -6.92440994e+00]
 [ -9.78874751e+00  -2.54150007e+01   1.56823598e+01   6.96971149e+00
   -3.27985907e+00   1.34188260e+01   7.40965258e+00  -3.12682974e+01
    2.03404727e+00  -3.63381541e+00]
 [ -8.85287112e+00   2.66860968e+01   7.71628630e+00   4.43763624e+00
   -8.98042851e-01   5.98833845e-01  -2.39387886e+00   7.29854305e-01
    1.43123719e+01   6.69087755e+00]
 [  3.54256587e+01  -4.56606239e-01  -6.99848301e+00   1.99915608e+01
   -1.59964938e+01  -2.27071931e+01   2.02312021e+01   1.06909320e+00
   -1.76973752e-01  -1.75425170e+00]
 [  5.25240992e+01   7.68853451e-01   1.02664303e+01  -2.31281590e+01
   -1.92723192e+01   1.62651057e+01  -5.94072735e+00   4.06960233e+00
   -1.86289169e+00   1.95025704e-01]
 [ -7.99866732e+00   4.65800825e+00  -2.05751541e+01   9.64108660e+00
   -3.11100542e+00   6.44702476e+00  -1.26195813e+01   2.70665691e+00
   -6.65875165e+00  -1.06594761e+01]
 [  4.48813417e+01   1.17560631e+00   6.25214231e+00   3.13785712e+00
    3.61479679e+01  -4.86635758e+00  -7.39432182e+00  -3.59829611e+00
    6.60974656e-01  -1.67956375e+00]
 [ -1.56183019e+01   3.27010705e+01   1.57258209e+01  -2.39271922e+00
   -1.24396235e+00   3.60139174e-02   2.84315780e+00   2.24028420e+00
    2.13476234e+01  -2.61147263e+00]
 [ -1.63770971e+01  -1.42055558e+01  -1.94036942e+01  -2.83652115e+01
    7.34057762e+00  -8.77246258e+00   1.59174890e+01   1.81838557e+00
    4.49529756e+00  -1.54266350e+01]
 [ -1.84180228e+01   3.85455760e+01   1.92714166e+01  -6.26492351e+00
    1.86267009e+00  -5.30749058e+00   6.67070495e+00  -3.99260993e+00
   -2.66448186e+01   5.31618792e-01]
 [ -1.28372652e+01  -2.61048327e+01   1.93503200e+01   1.20637139e+01
    7.29086404e+00   1.79685991e+01   1.53504285e+01   2.36170034e+01
   -3.36256650e+00   9.56262920e-01]]
[ 522.42345848  404.41079964  289.06982031  158.99936562  142.78822873
  137.15657394  132.40148297  108.69059412   97.35588238   80.86550314]
[ 0.22743729  0.17606043  0.12584668  0.06922045  0.06216292  0.05971118
  0.05764105  0.0473185   0.04238393  0.03520483]

In [9]:
vdf = pd.DataFrame()
vdf['PC'] = [(i+1) for i,x in enumerate(sklearn_pca.explained_variance_ratio_)]
vdf['var'] = sklearn_pca.explained_variance_ratio_

g = ggplot(vdf, aes(x='PC', y='var')) \
    + geom_point(size=10) \
    + ylab('Explained variance') \
    + ggtitle('Unfiltered -BCM')
print(g)


<ggplot: (-9223363304580426608)>

In [11]:
pca_df = pd.DataFrame()
pca_df['cond'] = ['%doC' % exp_df[exp_df['sample']==sample]['temp'] for sample in samples]
pca_df['PC1'] = Y[:,0]
pca_df['PC2'] = Y[:,1]

pca_df


Out[11]:
cond PC1 PC2
0 25oC -6.300806 3.739240
1 37oC 0.929415 -6.181645
2 44oC -17.599976 -32.666790
3 37oC -11.438467 -9.408411
4 25oC -8.530293 6.164390
5 44oC -9.788748 -25.415001
6 10oC -8.852871 26.686097
7 51oC 35.425659 -0.456606
8 51oC 52.524099 0.768853
9 25oC -7.998667 4.658008
10 51oC 44.881342 1.175606
11 10oC -15.618302 32.701070
12 37oC -16.377097 -14.205556
13 10oC -18.418023 38.545576
14 44oC -12.837265 -26.104833

In [12]:
g = ggplot(pca_df, aes(x='PC1', y='PC2', color='cond')) \
    + geom_point(size=10) \
    + ggtitle('Unfiltered -BCM')
print(g)


<ggplot: (-9223363304582241462)>

+BCM samples


In [13]:
# build expression matrix

X = pd.DataFrame()
samples = []

for sample in set(utr['sample']):
    mask = (utr['sample']==sample) & (utr['bcm']==True)
    if not utr[mask].empty:
        X[sample] = utr[mask]['utr_norm'].values
        samples.append(sample)
    
X_std = StandardScaler().fit_transform(X.values.T)
X_std


Out[13]:
array([[-0.52008826, -1.01091213, -1.23792619, ..., -0.93258482,
         0.76962785,  0.0732628 ],
       [-0.52008826, -1.01091213, -0.61442217, ...,  1.67582378,
        -0.78130151, -0.85661123],
       [ 1.96215118,  1.31653673, -0.00663674, ..., -0.24348713,
        -0.09026242,  3.37008893],
       ..., 
       [-0.52008826,  1.17547922,  1.15653883, ...,  0.62746817,
        -0.34979294,  0.22119731],
       [-0.52008826, -1.01091213, -1.1855309 , ..., -1.01014174,
         1.45128631, -0.41280772],
       [ 2.43495869,  0.63140027,  2.03154027, ...,  0.38944866,
        -0.44359915, -0.0535382 ]])

In [14]:
sklearn_pca = sklearnPCA(n_components=10)
Y = sklearn_pca.fit_transform(X_std)

print(Y)
print(sklearn_pca.explained_variance_)
print(sklearn_pca.explained_variance_ratio_)


[[ -3.37130881e+01   1.45851207e+00  -1.50862874e+01   7.44624740e+00
   -4.59009537e+00   3.41488872e+00  -1.13385653e+00   2.40747633e-01
   -1.22180934e+00   2.05261769e+00]
 [ -2.12086124e+01  -1.98830920e+01   3.75730772e+00  -2.13009791e+01
    1.18471279e+00  -4.29298238e+00   1.59630324e+00   1.41539866e+00
    7.34173664e+00  -2.32817727e+01]
 [ -1.49913955e+00  -1.14836795e+01   2.25292058e+01   1.89403268e+01
   -7.10782450e+00  -2.68500257e+01   3.60198207e+00  -9.36635082e+00
   -1.97976650e+00   2.66883480e+00]
 [  3.96839817e+01  -6.48702588e+00  -1.33659410e+01   2.54137225e+00
    8.57790929e+00  -1.31991483e+00  -1.00540944e+00   1.14373662e+00
   -2.65532381e+01  -9.11077593e+00]
 [  7.12197320e+00  -1.53701758e+01   2.50897415e+01   1.24911798e+01
   -1.03378225e+01   1.22931627e+01   2.09553576e+00   2.46703191e+01
   -9.55383614e-01  -3.65992534e-01]
 [ -2.47509806e+01  -9.52081998e+00   6.06360934e-01  -9.91147698e+00
    8.49525176e+00  -4.19711051e+00  -9.18443433e-01   2.10811422e-01
    2.75447540e+00  -1.89707697e+00]
 [  2.05065646e+01   5.18780897e+01   1.48239121e+01  -1.99096312e+01
   -1.78205917e+01   2.77971883e+00   2.25903297e+00  -5.77262475e+00
   -3.36518392e+00  -1.42890617e+00]
 [ -7.37895274e+00   3.14524814e+01  -1.53248587e+00   8.45269632e+00
    1.62903287e+01  -2.86310145e+00  -2.97026252e+00   5.89293771e+00
    3.31983300e+00  -1.32239116e+00]
 [ -8.78856974e+00  -1.96462669e+01   1.04786501e+01  -2.71477743e+01
    1.13018073e+01   2.54687602e-01  -1.70625294e+00   8.31674727e-01
   -5.79318851e+00   2.08666240e+01]
 [ -3.57625874e+01  -4.95493660e+00  -1.79735303e+01   6.98976532e-01
   -2.07800007e+01   2.81147455e+00   1.15278649e+00  -3.43364181e+00
   -4.53323771e+00   3.65106073e+00]
 [  7.85176166e+00  -1.19658145e+01   1.62812788e+01   1.26490068e+01
    7.75591539e+00   2.13983233e+01  -4.85400865e+00  -2.21908282e+01
    3.04025964e+00  -2.76640785e+00]
 [ -8.22252517e+00   3.07492327e+01   6.82653705e-02   7.81845317e+00
    1.58126013e+01  -2.91932101e+00  -8.55182776e-01   5.55095481e+00
    5.08618231e+00   1.68767722e+00]
 [  4.74180259e+01  -9.08036850e+00  -1.62634715e+01  -1.35681210e-01
    1.45628099e+00   1.26294188e+00   2.74038292e+01  -1.12232518e+00
    1.08444851e+01   4.02323791e+00]
 [ -3.41735772e+01   3.26795451e+00  -1.49952207e+01   8.82829573e+00
   -2.14583285e+00   3.31467391e+00   2.15015872e-02  -2.94061093e-01
    1.29858919e-01   2.31296945e+00]
 [  5.29157260e+01  -1.04140908e+01  -1.44177855e+01  -1.46101195e+00
   -8.09263992e+00  -5.08741562e+00  -2.46875550e+01   2.22325112e+00
    1.18849767e+01   2.91030154e+00]]
[ 800.23889068  418.64754873  211.67005979  175.67925816  123.36736185
   96.68418696   95.06387168   87.32222925   76.2219727    75.54400493]
[ 0.34256802  0.17921556  0.09061218  0.07520516  0.05281137  0.04138878
  0.04069515  0.03738109  0.03262927  0.03233904]

In [15]:
vdf = pd.DataFrame()
vdf['PC'] = [(i+1) for i,x in enumerate(sklearn_pca.explained_variance_ratio_)]
vdf['var'] = sklearn_pca.explained_variance_ratio_

g = ggplot(vdf, aes(x='PC', y='var')) \
    + geom_point(size=10) \
    + ylab('Explained variance') \
    + ggtitle('Unfiltered +BCM')
print(g)


<ggplot: (8732266129072)>

In [16]:
pca_df = pd.DataFrame()
pca_df['cond'] = ['%doC' % exp_df[exp_df['sample']==sample]['temp'] for sample in samples]
pca_df['PC1'] = Y[:,0]
pca_df['PC2'] = Y[:,1]

pca_df


Out[16]:
cond PC1 PC2
0 10oC -33.713088 1.458512
1 25oC -21.208612 -19.883092
2 37oC -1.499140 -11.483680
3 44oC 39.683982 -6.487026
4 37oC 7.121973 -15.370176
5 25oC -24.750981 -9.520820
6 51oC 20.506565 51.878090
7 51oC -7.378953 31.452481
8 25oC -8.788570 -19.646267
9 10oC -35.762587 -4.954937
10 37oC 7.851762 -11.965814
11 51oC -8.222525 30.749233
12 44oC 47.418026 -9.080368
13 10oC -34.173577 3.267955
14 44oC 52.915726 -10.414091

In [17]:
g = ggplot(pca_df, aes(x='PC1', y='PC2', color='cond')) \
    + geom_point(size=10) \
    + ggtitle('Unfiltered +BCM')
print(g)


<ggplot: (8732266126256)>

In [ ]: